library(cluster)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
## Warning: package 'skimr' was built under R version 4.3.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.3.3
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.3.3
library(ggplot2)
library(scales)
## Warning: package 'scales' was built under R version 4.3.3
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.3.3
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(waffle)
## Warning: package 'waffle' was built under R version 4.3.3
library(dplyr)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:gridExtra':
##
## combine
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
custom_red <- "#f8766d"
custom_blue <- "#00bfc4"
hotel_data <- read.csv("C:/Users/mamid/Downloads/Hotel Reservations.csv")
head(hotel_data)
dim(hotel_data)
## [1] 36275 19
str(hotel_data)
## 'data.frame': 36275 obs. of 19 variables:
## $ Booking_ID : chr "INN00001" "INN00002" "INN00003" "INN00004" ...
## $ no_of_adults : int 2 2 1 2 2 2 2 2 3 2 ...
## $ no_of_children : int 0 0 0 0 0 0 0 0 0 0 ...
## $ no_of_weekend_nights : int 1 2 2 0 1 0 1 1 0 0 ...
## $ no_of_week_nights : int 2 3 1 2 1 2 3 3 4 5 ...
## $ required_car_parking_space : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lead_time : int 224 5 1 211 48 346 34 83 121 44 ...
## $ arrival_year : int 2017 2018 2018 2018 2018 2018 2017 2018 2018 2018 ...
## $ arrival_month : int 10 11 2 5 4 9 10 12 7 10 ...
## $ arrival_date : int 2 6 28 20 11 13 15 26 6 18 ...
## $ repeated_guest : int 0 0 0 0 0 0 0 0 0 0 ...
## $ no_of_previous_cancellations : int 0 0 0 0 0 0 0 0 0 0 ...
## $ no_of_previous_bookings_not_canceled: int 0 0 0 0 0 0 0 0 0 0 ...
## $ avg_price_per_room : num 65 106.7 60 100 94.5 ...
## $ no_of_special_requests : int 0 1 0 0 0 1 1 1 1 3 ...
## $ room_type_reserved : chr "Room_Type 1" "Room_Type 1" "Room_Type 1" "Room_Type 1" ...
## $ type_of_meal_plan : chr "Meal Plan 1" "Not Selected" "Meal Plan 1" "Meal Plan 1" ...
## $ market_segment_type : chr "Offline" "Online" "Online" "Online" ...
## $ booking_status : chr "Not_Canceled" "Not_Canceled" "Canceled" "Canceled" ...
summary(hotel_data)
## Booking_ID no_of_adults no_of_children no_of_weekend_nights
## Length:36275 Min. :0.000 Min. : 0.0000 Min. :0.0000
## Class :character 1st Qu.:2.000 1st Qu.: 0.0000 1st Qu.:0.0000
## Mode :character Median :2.000 Median : 0.0000 Median :1.0000
## Mean :1.845 Mean : 0.1053 Mean :0.8107
## 3rd Qu.:2.000 3rd Qu.: 0.0000 3rd Qu.:2.0000
## Max. :4.000 Max. :10.0000 Max. :7.0000
## no_of_week_nights required_car_parking_space lead_time arrival_year
## Min. : 0.000 Min. :0.00000 Min. : 0.00 Min. :2017
## 1st Qu.: 1.000 1st Qu.:0.00000 1st Qu.: 17.00 1st Qu.:2018
## Median : 2.000 Median :0.00000 Median : 57.00 Median :2018
## Mean : 2.204 Mean :0.03099 Mean : 85.23 Mean :2018
## 3rd Qu.: 3.000 3rd Qu.:0.00000 3rd Qu.:126.00 3rd Qu.:2018
## Max. :17.000 Max. :1.00000 Max. :443.00 Max. :2018
## arrival_month arrival_date repeated_guest no_of_previous_cancellations
## Min. : 1.000 Min. : 1.0 Min. :0.00000 Min. : 0.00000
## 1st Qu.: 5.000 1st Qu.: 8.0 1st Qu.:0.00000 1st Qu.: 0.00000
## Median : 8.000 Median :16.0 Median :0.00000 Median : 0.00000
## Mean : 7.424 Mean :15.6 Mean :0.02564 Mean : 0.02335
## 3rd Qu.:10.000 3rd Qu.:23.0 3rd Qu.:0.00000 3rd Qu.: 0.00000
## Max. :12.000 Max. :31.0 Max. :1.00000 Max. :13.00000
## no_of_previous_bookings_not_canceled avg_price_per_room no_of_special_requests
## Min. : 0.0000 Min. : 0.00 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 80.30 1st Qu.:0.0000
## Median : 0.0000 Median : 99.45 Median :0.0000
## Mean : 0.1534 Mean :103.42 Mean :0.6197
## 3rd Qu.: 0.0000 3rd Qu.:120.00 3rd Qu.:1.0000
## Max. :58.0000 Max. :540.00 Max. :5.0000
## room_type_reserved type_of_meal_plan market_segment_type booking_status
## Length:36275 Length:36275 Length:36275 Length:36275
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
skim(hotel_data)
| Name | hotel_data |
| Number of rows | 36275 |
| Number of columns | 19 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| numeric | 14 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Booking_ID | 0 | 1 | 8 | 8 | 0 | 36275 | 0 |
| room_type_reserved | 0 | 1 | 11 | 11 | 0 | 7 | 0 |
| type_of_meal_plan | 0 | 1 | 11 | 12 | 0 | 4 | 0 |
| market_segment_type | 0 | 1 | 6 | 13 | 0 | 5 | 0 |
| booking_status | 0 | 1 | 8 | 12 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| no_of_adults | 0 | 1 | 1.84 | 0.52 | 0 | 2.0 | 2.00 | 2 | 4 | ▁▂▇▁▁ |
| no_of_children | 0 | 1 | 0.11 | 0.40 | 0 | 0.0 | 0.00 | 0 | 10 | ▇▁▁▁▁ |
| no_of_weekend_nights | 0 | 1 | 0.81 | 0.87 | 0 | 0.0 | 1.00 | 2 | 7 | ▇▃▁▁▁ |
| no_of_week_nights | 0 | 1 | 2.20 | 1.41 | 0 | 1.0 | 2.00 | 3 | 17 | ▇▁▁▁▁ |
| required_car_parking_space | 0 | 1 | 0.03 | 0.17 | 0 | 0.0 | 0.00 | 0 | 1 | ▇▁▁▁▁ |
| lead_time | 0 | 1 | 85.23 | 85.93 | 0 | 17.0 | 57.00 | 126 | 443 | ▇▃▁▁▁ |
| arrival_year | 0 | 1 | 2017.82 | 0.38 | 2017 | 2018.0 | 2018.00 | 2018 | 2018 | ▂▁▁▁▇ |
| arrival_month | 0 | 1 | 7.42 | 3.07 | 1 | 5.0 | 8.00 | 10 | 12 | ▃▃▅▆▇ |
| arrival_date | 0 | 1 | 15.60 | 8.74 | 1 | 8.0 | 16.00 | 23 | 31 | ▇▇▇▆▆ |
| repeated_guest | 0 | 1 | 0.03 | 0.16 | 0 | 0.0 | 0.00 | 0 | 1 | ▇▁▁▁▁ |
| no_of_previous_cancellations | 0 | 1 | 0.02 | 0.37 | 0 | 0.0 | 0.00 | 0 | 13 | ▇▁▁▁▁ |
| no_of_previous_bookings_not_canceled | 0 | 1 | 0.15 | 1.75 | 0 | 0.0 | 0.00 | 0 | 58 | ▇▁▁▁▁ |
| avg_price_per_room | 0 | 1 | 103.42 | 35.09 | 0 | 80.3 | 99.45 | 120 | 540 | ▇▅▁▁▁ |
| no_of_special_requests | 0 | 1 | 0.62 | 0.79 | 0 | 0.0 | 0.00 | 1 | 5 | ▇▁▁▁▁ |
##Checking Null Values & Duplicated Rows
colSums(is.na(hotel_data))
## Booking_ID no_of_adults
## 0 0
## no_of_children no_of_weekend_nights
## 0 0
## no_of_week_nights required_car_parking_space
## 0 0
## lead_time arrival_year
## 0 0
## arrival_month arrival_date
## 0 0
## repeated_guest no_of_previous_cancellations
## 0 0
## no_of_previous_bookings_not_canceled avg_price_per_room
## 0 0
## no_of_special_requests room_type_reserved
## 0 0
## type_of_meal_plan market_segment_type
## 0 0
## booking_status
## 0
duplicate_rows <- duplicated(hotel_data)
hotel_data[duplicate_rows, ]
No duplicate entries found.
sapply(hotel_data, function(x) length(unique(x)))
## Booking_ID no_of_adults
## 36275 5
## no_of_children no_of_weekend_nights
## 6 8
## no_of_week_nights required_car_parking_space
## 18 2
## lead_time arrival_year
## 352 2
## arrival_month arrival_date
## 12 31
## repeated_guest no_of_previous_cancellations
## 2 9
## no_of_previous_bookings_not_canceled avg_price_per_room
## 59 3930
## no_of_special_requests room_type_reserved
## 6 7
## type_of_meal_plan market_segment_type
## 4 5
## booking_status
## 2
table(hotel_data$type_of_meal_plan)
##
## Meal Plan 1 Meal Plan 2 Meal Plan 3 Not Selected
## 27835 3305 5 5130
table(hotel_data$room_type_reserved)
##
## Room_Type 1 Room_Type 2 Room_Type 3 Room_Type 4 Room_Type 5 Room_Type 6
## 28130 692 7 6057 265 966
## Room_Type 7
## 158
table(hotel_data$market_segment_type)
##
## Aviation Complementary Corporate Offline Online
## 125 391 2017 10528 23214
table(hotel_data$booking_status)
##
## Canceled Not_Canceled
## 11885 24390
hotel_data <- subset(hotel_data, select = -Booking_ID)
Transforming the “booking_status” column to “canceled” and using Boolean variable types.
names(hotel_data)[names(hotel_data) == "booking_status"] <- "canceled"
hotel_data$canceled <- ifelse(hotel_data$canceled == "Canceled", TRUE, FALSE)
head(hotel_data["canceled"])
Column type has been transformed to Logical, aka Boolean.
#Transforming Columns to Logical Type
Converting the “repeated_guest” and “required_car_parking_space” columns to Boolean variables.
hotel_data$repeated_guest <- ifelse(hotel_data$repeated_guest == 1, TRUE, FALSE)
hotel_data$required_car_parking_space <- ifelse(hotel_data$required_car_parking_space == 1, TRUE, FALSE)
head(select(hotel_data,repeated_guest,required_car_parking_space))
#Transforming Columns to Numerical Type
#Room Type Reserved Column
Transforming the “room_type_reserved” column into an integer representation of room types by replacing “Room_Type” with an empty character using the gsub() function.
hotel_data$room_type_reserved <- gsub("Room_Type ", "", hotel_data$room_type_reserved) # Replace "Room_Type " with empty char
head(hotel_data["room_type_reserved"])
The column is still Char, updating it to integer type
hotel_data$room_type_reserved <- as.integer(hotel_data$room_type_reserved)
print(typeof(hotel_data$room_type_reserved))
## [1] "integer"
#Type of Meal Plan Column
hotel_data$type_of_meal_plan <- gsub("Not Selected", 0, hotel_data$type_of_meal_plan) # Replace "Not Selected" with 0
hotel_data$type_of_meal_plan <- gsub("Meal Plan ", "", hotel_data$type_of_meal_plan)
hotel_data$type_of_meal_plan <- as.integer(hotel_data$type_of_meal_plan)
head(hotel_data["type_of_meal_plan"])
#Merging Date Columns in a Single One
Introducing a new column, ‘date’, formatted as a Date type. This column will prove valuable for future analytics purposes.
hotel_data <- cbind(hotel_data[, 1:11], date = as.Date(paste(hotel_data$arrival_date, hotel_data$arrival_month, hotel_data$arrival_year, sep="-"), format="%d-%m-%Y"), hotel_data[, 12:ncol(hotel_data)])
subset_data <- subset(hotel_data, is.na(date), c(arrival_year, arrival_month, arrival_date, date))
An issue was discovered in the dataset: February 29th is invalid in 2018 as it was not a leap year. To address this, all rows corresponding to this non-existent date will be removed from the original dataset.
hotel_data <- hotel_data[complete.cases(hotel_data$date), ]
The changes have been implemented successfully, resulting in a dataset containing 25,965 rows.
#Distribution of Canceled Bookings
hotel_data_plot <- ggplot(hotel_data, aes(x = canceled, fill = canceled)) +
geom_bar() +
geom_text(stat='count', aes(label=after_stat(count)), vjust=-0.64) +
theme_void() +
guides(fill = "none")
hotel_pie_chart <- ggplot(hotel_data, aes(x = "", fill = canceled)) +
geom_bar(width = 1) +
coord_polar(theta = "y") +
guides(fill = guide_legend(title = "Canceled", ncol = 1)) +
geom_text(aes(label = paste0(round((after_stat(count))/sum(after_stat(count)) * 100, 2), "%")),
stat = "count",
position = position_stack(vjust = 0.5)) +
theme_void() +
theme(legend.position = "bottom")
grid.arrange(hotel_data_plot,
hotel_pie_chart,
ncol = 2, widths = c(4, 3.5), top = "Distribution of Canceled Bookings")
Out of the total number of bookings (25,965), only 7,435 (28.63%) were canceled, while 18,530 (71.37%) reservations were confirmed.
#Variation of the Average Price per Room
ggplot(hotel_data, aes(x = date, y = avg_price_per_room)) +
geom_smooth(method="auto") +
geom_smooth(method="lm",color="red")+
labs(x = "Month", y = "Average Price per Room") +
ggtitle("Variation of Average Price per Room over Time (2017-2018)") +
scale_x_date(date_breaks = "1 month", date_labels = "%m")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'
Over time, we see a steady increase in the average accommodation price, with two significant peaks around May/June 2018 and September 2018. Although it peaked much lower than in 2018, there was still a price increase in September 2017. The early months of the year, from January to mid-February, are usually when the prices are lowest. The link between supply and demand is clearly shown in this chart, where prices tend to grow in the summer and around September because of strong demand, but they stay relatively lower at the beginning of the year because of weaker demand.
ggplot(hotel_data, aes(x = date)) +
geom_bar(aes(fill = canceled)) +
geom_density(data = subset(hotel_data, canceled == TRUE), aes(y = after_stat(count)),linewidth=0.8)+
labs(x = "Date", y = "Count", fill = "Canceled") +
ggtitle("Variation of Reservations count over Time (2017-2018)") +
theme(legend.position = "bottom")+
scale_x_date(date_breaks = "1 month", date_labels = "%m")
The graph displays the evolution of reservations over time, encompassing both canceled and uncanceled bookings. It exhibits a pattern akin to that of the average room price variance, which can be attributed to variations in demand throughout the year.
Reservations tend to be accompanied by an increase in cancellations. We see an increase in cancellations beginning in February, which peaks modestly in mid-April, declines slightly in June and July, and peaks significantly in mid-August to mid-September. By year’s conclusion, cancellations begin to decline once more. Furthermore, there aren’t many cancellations between November and January, which suggests a reduced cancellation rate during that time.
hotel_data_plot <- ggplot(hotel_data, aes(x = type_of_meal_plan, fill = canceled)) +
geom_bar(position="dodge") +
labs(x = "", y = "", fill = "Canceled") +
geom_text(stat='count', aes(label=after_stat(count)),position=position_dodge(width = 0.85), vjust=-0.2) +
theme(legend.position = c(0.98, 0.98),
legend.justification = c(1, 1))
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
hotel_waffle_chart <- waffle(prop.table(table(hotel_data$type_of_meal_plan)) * 100,rows=11,reverse = TRUE,size=1.5, legend_pos = "bottom") +
theme(legend.direction = "vertical")+
theme(legend.spacing.y = unit(-0.5,"cm"))
grid.arrange(hotel_data_plot, hotel_waffle_chart, ncol = 2, widths = c(2, 1),top="Distribution of Meal Plan Types by Cancellation Status")
The majority of bookings either opt for the first meal plan option or do not select any meal plan at all.
hotel_data_plot <- ggplot(hotel_data, aes(x = room_type_reserved, fill = canceled)) +
geom_bar(position="dodge") +
labs(x = "", y = "", fill = "Canceled") +
geom_text(stat='count', aes(label=after_stat(count)),position=position_dodge(width = 0.9), vjust=-0.5,size =3.1) +
theme(legend.position = c(0.98, 0.98),
legend.justification = c(1, 1))+
scale_x_continuous(breaks = hotel_data$room_type_reserved)
hotel_waffle_chart <- waffle(prop.table(table(hotel_data$room_type_reserved)) * 100,rows=11,reverse = TRUE,size=1.5, legend_pos = "bottom") +
theme(legend.direction = "vertical",
legend.spacing.y = unit(-0.5,"cm"),
legend.title = element_blank(),
legend.text = element_text(size = 10)) +
guides(fill = guide_legend(override.aes = list(size = 3)))
grid.arrange(hotel_data_plot, hotel_waffle_chart, ncol = 2, widths = c(2, 1),top="Distribution of Room Types Reserved by Cancellation Status")
The majority of clients prefer either the first type of rooms or the fourth type.
ggplot(hotel_data, aes(x = lead_time)) +
geom_histogram(binwidth = 10,color = "white",fill=custom_blue) +
labs(x = "Lead Time", y = "Count") +
ggtitle("Variation of Lead Time")
There is an inversely proportional relationship between the lead time and the number of reservations. As the lead time increases, the number of reservations decreases.
#variation of Lead time by Booking status
ggplot(hotel_data, aes(x = lead_time, fill = canceled, group = canceled)) +
geom_density(alpha = 0.8) +
labs(x = "Lead Time", y = "Density", fill = "Canceled") +
ggtitle("Variation of Lead Time by Booking Status")
A discernible pattern suggests that the probability of cancellations rises with increasing lead times. Conversely, shorter lead times typically result in a higher likelihood of confirmed reservations.
histogram_adults_data <- ggplot(hotel_data) +
geom_histogram(aes(x = no_of_adults),binwidth = 1,color="white",fill=custom_red) +
labs( y = "Count",x="") +
ggtitle("Distribution of the Number of Adults") +
theme(text=element_text(size=10))
histogram_children_data <- ggplot(hotel_data) +
geom_histogram(aes(x = no_of_children),binwidth = 1, color="white",fill=custom_blue) +
labs(x = "", y = "") +
coord_cartesian(xlim = c(0, 3)) +
scale_x_continuous(breaks = seq(0, 10, 1)) +
ggtitle("Distribution of the Number of Children") +
theme(text=element_text(size=10))
grid.arrange(histogram_adults_data, histogram_children_data, nrow = 1)
The majority of bookings consist of 2 adults and no children.
hist_weekends_night <- ggplot(hotel_data) +
geom_histogram(aes(x = no_of_weekend_nights), binwidth = 1, color = "white",fill=custom_red) +
labs(y = "Count", x = "") +
coord_cartesian(xlim = c(0, 5)) +
ggtitle("Distribution of Number of Weekend Nights") +
theme(plot.title = element_text(size = 11))
hist_weekdays_nights <- ggplot(hotel_data) +
geom_histogram(aes(x = no_of_week_nights), binwidth = 1, color = "white",fill=custom_blue) +
labs(x = "", y = "") +
coord_cartesian(xlim = c(0, 11)) +
ggtitle("Distribution of Number of Week Nights") +
theme(plot.title = element_text(size = 11))
grid.arrange(hist_weekends_night, hist_weekdays_nights, nrow = 1)
According to the data, a sizable portion of reservations only include weekday stays of one to three days and do not include weekend nights. On the other hand, a sizable percentage of reservations are for the full weekend, suggesting that longer weekend vacations are preferred.
ggplot(hotel_data, aes(x = no_of_special_requests)) +
geom_histogram(binwidth = 1,color = "white",fill=custom_blue) +
labs(x = "Number of Special Requests", y = "Count") +
scale_x_continuous(breaks = seq(0, max(hotel_data$no_of_special_requests), 1)) +
ggtitle("Variation of Special Requests count")
Most customers usually don’t have any particular requests when they make a reservation. A tiny fraction, meanwhile, might have one or two exceptional needs, and in extreme circumstances, up to five special requirements.
hotel_data_plot_repeated_guest <- ggplot(hotel_data, aes(x = repeated_guest, fill = repeated_guest)) +
geom_bar() +
geom_text(stat = 'count', aes(label = after_stat(count)), vjust = -0.64) +
scale_fill_manual(values = c(custom_red,custom_blue)) +
theme_void() +
theme(legend.position = "none")
hotel_pie_chart_repeated_guest <- ggplot(hotel_data, aes(x = "", fill = repeated_guest)) +
geom_bar(width = 1) +
coord_polar(theta = "y") +
guides(fill = guide_legend(title = "Repeated Guest", ncol = 1)) +
geom_text(aes(label = paste0(round((after_stat(count)) / sum(after_stat(count)) * 100, 2), "%")),
stat = "count",
position = position_stack(vjust = 0.5)) +
theme_void() +
theme(legend.position = "bottom")
grid.arrange(hotel_data_plot_repeated_guest,
hotel_pie_chart_repeated_guest,
ncol = 2, widths = c(4, 3.5), top = "Distribution of Repeated Guest")
Since they make up about 96.7% of all guests, it is clear from the data that most of them are first-time guests at the hotel. Just 3.3% of the guests are repeat customers who have stayed at the hotel before.
numerical_data <- hotel_data %>%
select_if(is.numeric)
numerical_data <- hotel_data[, sapply(hotel_data, is.numeric)]
numerical_data <- Filter(is.numeric, hotel_data)
summary(numerical_data)
## no_of_adults no_of_children no_of_weekend_nights no_of_week_nights
## Min. :0.000 Min. : 0.0000 Min. :0.0000 Min. : 0.000
## 1st Qu.:2.000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.: 1.000
## Median :2.000 Median : 0.0000 Median :1.0000 Median : 2.000
## Mean :1.845 Mean : 0.1052 Mean :0.8105 Mean : 2.204
## 3rd Qu.:2.000 3rd Qu.: 0.0000 3rd Qu.:2.0000 3rd Qu.: 3.000
## Max. :4.000 Max. :10.0000 Max. :7.0000 Max. :17.000
## lead_time arrival_year arrival_month arrival_date
## Min. : 0.00 Min. :2017 Min. : 1.000 Min. : 1.00
## 1st Qu.: 17.00 1st Qu.:2018 1st Qu.: 5.000 1st Qu.: 8.00
## Median : 57.00 Median :2018 Median : 8.000 Median :16.00
## Mean : 85.28 Mean :2018 Mean : 7.429 Mean :15.58
## 3rd Qu.:126.00 3rd Qu.:2018 3rd Qu.:10.000 3rd Qu.:23.00
## Max. :443.00 Max. :2018 Max. :12.000 Max. :31.00
## no_of_previous_cancellations no_of_previous_bookings_not_canceled
## Min. : 0.00000 Min. : 0.000
## 1st Qu.: 0.00000 1st Qu.: 0.000
## Median : 0.00000 Median : 0.000
## Mean : 0.02335 Mean : 0.153
## 3rd Qu.: 0.00000 3rd Qu.: 0.000
## Max. :13.00000 Max. :58.000
## avg_price_per_room no_of_special_requests room_type_reserved type_of_meal_plan
## Min. : 0.00 Min. :0.00 Min. :1.000 Min. :0.0000
## 1st Qu.: 80.30 1st Qu.:0.00 1st Qu.:1.000 1st Qu.:1.0000
## Median : 99.45 Median :0.00 Median :1.000 Median :1.0000
## Mean :103.44 Mean :0.62 Mean :1.708 Mean :0.9499
## 3rd Qu.:120.00 3rd Qu.:1.00 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :540.00 Max. :5.00 Max. :7.000 Max. :3.0000
standardised_data <- scale(numerical_data)
correlation_hotel_data <- round(cor(numerical_data), 2)
melted_cormat <- melt(correlation_hotel_data)
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill = value)) +
geom_tile() +
scale_fill_gradient2(low = custom_blue, high = custom_red,
limit = c(-1,1), name="Correlation") +
theme(axis.text.x = element_text(angle = 50, hjust = 1)) +
geom_text(aes(Var2, Var1, label = value),size = 2) +
labs(x = NULL, y = NULL)
pca <- PCA(standardised_data)
variance <- get_pca_var(pca)
fviz_pca_var(pca, col.var="contrib", gradient.cols = c("black","yellow","purple","red","blue","green","pink","violet","brown","orange"),ggrepel = TRUE ) + labs( title = "PCA Variable Variance")
The correlation calculations clearly show that the target column “canceled” is positively correlated with “lead_time,” “market_segment_type,” and “avg_price_per_room.” On the other hand, “repeated_guest” and “no_of_special_requests” show a negative correlation with the goal feature. Remarkably, though, “type_of_meal_plan,” “arrival_date,” and “arrival_month” exhibit little to no impact on the customer’s choice to cancel their reservation.
hotel_data$market_segment_type <- as.factor(hotel_data$market_segment_type)
hotel_data$canceled <- as.factor(hotel_data$canceled)
groups <- dummyVars(~ market_segment_type + canceled, data = hotel_data)
hotel_data <- cbind(hotel_data, as.data.frame(predict(groups, hotel_data)))
hotel_data[, -c(19)]
numeric_data <- hotel_data %>%
select_if(is.numeric)
numeric_data <- hotel_data[, sapply(hotel_data, is.numeric)]
numeric_data <- Filter(is.numeric, hotel_data)
numeric_data <- numeric_data[, -c(20)]
summary(numeric_data)
## no_of_adults no_of_children no_of_weekend_nights no_of_week_nights
## Min. :0.000 Min. : 0.0000 Min. :0.0000 Min. : 0.000
## 1st Qu.:2.000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.: 1.000
## Median :2.000 Median : 0.0000 Median :1.0000 Median : 2.000
## Mean :1.845 Mean : 0.1052 Mean :0.8105 Mean : 2.204
## 3rd Qu.:2.000 3rd Qu.: 0.0000 3rd Qu.:2.0000 3rd Qu.: 3.000
## Max. :4.000 Max. :10.0000 Max. :7.0000 Max. :17.000
## lead_time arrival_year arrival_month arrival_date
## Min. : 0.00 Min. :2017 Min. : 1.000 Min. : 1.00
## 1st Qu.: 17.00 1st Qu.:2018 1st Qu.: 5.000 1st Qu.: 8.00
## Median : 57.00 Median :2018 Median : 8.000 Median :16.00
## Mean : 85.28 Mean :2018 Mean : 7.429 Mean :15.58
## 3rd Qu.:126.00 3rd Qu.:2018 3rd Qu.:10.000 3rd Qu.:23.00
## Max. :443.00 Max. :2018 Max. :12.000 Max. :31.00
## no_of_previous_cancellations no_of_previous_bookings_not_canceled
## Min. : 0.00000 Min. : 0.000
## 1st Qu.: 0.00000 1st Qu.: 0.000
## Median : 0.00000 Median : 0.000
## Mean : 0.02335 Mean : 0.153
## 3rd Qu.: 0.00000 3rd Qu.: 0.000
## Max. :13.00000 Max. :58.000
## avg_price_per_room no_of_special_requests room_type_reserved type_of_meal_plan
## Min. : 0.00 Min. :0.00 Min. :1.000 Min. :0.0000
## 1st Qu.: 80.30 1st Qu.:0.00 1st Qu.:1.000 1st Qu.:1.0000
## Median : 99.45 Median :0.00 Median :1.000 Median :1.0000
## Mean :103.44 Mean :0.62 Mean :1.708 Mean :0.9499
## 3rd Qu.:120.00 3rd Qu.:1.00 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :540.00 Max. :5.00 Max. :7.000 Max. :3.0000
## market_segment_type.Aviation market_segment_type.Complementary
## Min. :0.000000 Min. :0.00000
## 1st Qu.:0.000000 1st Qu.:0.00000
## Median :0.000000 Median :0.00000
## Mean :0.003449 Mean :0.01076
## 3rd Qu.:0.000000 3rd Qu.:0.00000
## Max. :1.000000 Max. :1.00000
## market_segment_type.Corporate market_segment_type.Offline
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000
## Mean :0.05549 Mean :0.2902
## 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000
## market_segment_type.Online canceled.TRUE
## Min. :0.00 Min. :0.0000
## 1st Qu.:0.00 1st Qu.:0.0000
## Median :1.00 Median :0.0000
## Mean :0.64 Mean :0.3278
## 3rd Qu.:1.00 3rd Qu.:1.0000
## Max. :1.00 Max. :1.0000
correlation_data <- round(cor(numeric_data), 2)
melted_cormat_2 <- melt(correlation_data)
ggplot(data = melted_cormat_2, aes(x=Var1, y=Var2, fill = value)) +
geom_tile() +
scale_fill_gradient2(low = custom_blue, high = custom_red,
limit = c(-1,1), name="Correlation") +
theme(axis.text.x = element_text(angle = 50, hjust = 1)) +
geom_text(aes(Var2, Var1, label = value),size = 2) +
labs(x = NULL, y = NULL)
pca_new <- PCA(numeric_data)
columns_to_extract <- c(12, 5, 11, 6, 20)
DATASET <- data.frame(numeric_data[, columns_to_extract])
features_pca <- PCA(DATASET)
pairs(DATASET)
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(DATASET)
library(class)
## Warning: package 'class' was built under R version 4.3.2
library(caret)
# Split the dataset into training and testing sets
set.seed(123) # For reproducibility
train_index <- sample(1:nrow(DATASET), 0.7 * nrow(DATASET)) # 70% for training
train_data <- DATASET[train_index, ]
test_data <- DATASET[-train_index, ]
response_variable_index <- which(names(DATASET) == "canceled.TRUE")
response_variable_index
## [1] 5
# Preprocess the data if necessary (e.g., scaling numeric variables)
# Train the KNN model
k <- 5
# Number of neighbors
knn_model <- knn(train = train_data[, -response_variable_index],
test = test_data[, -response_variable_index],
cl = train_data[, response_variable_index],
k = k)
confusion_matrix_knn <- table(Actual = test_data$canceled.TRUE, Predicted = knn_model)
print(confusion_matrix_knn)
## Predicted
## Actual 0 1
## 0 6569 763
## 1 1307 2233
library(caret)
k_values <- seq(1, 25, by = 2)
train_control <- trainControl(method = "cv", number = 10)
knn_model_results <- train(form = canceled.TRUE ~ .,
data = train_data,
method = "knn",
trControl = train_control,
tuneGrid = expand.grid(k = k_values))
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
best_k <- knn_model_results$bestTune$k
final_knn_model <- knn(train = train_data[, -response_variable_index],
test = test_data[, -response_variable_index],
cl = train_data[, response_variable_index],
k = best_k)
best_k
## [1] 9
confusion_matrix_KNN <- table(Actual = test_data$canceled.TRUE, Predicted = final_knn_model)
print(confusion_matrix_KNN)
## Predicted
## Actual 0 1
## 0 6692 640
## 1 1474 2066
plot(final_knn_model)
library(e1071)
# Train Naive Bayes model
naive_bayes_model <- naiveBayes(canceled.TRUE ~ ., data = train_data)
naive_bayes_model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.6712923 0.3287077
##
## Conditional probabilities:
## no_of_special_requests
## Y [,1] [,2]
## 0 0.7608645 0.8353723
## 1 0.3292156 0.5719549
##
## lead_time
## Y [,1] [,2]
## 0 59.2170 64.10792
## 1 139.4872 98.42692
##
## avg_price_per_room
## Y [,1] [,2]
## 0 99.99582 35.58241
## 1 110.55594 32.55786
##
## arrival_year
## Y [,1] [,2]
## 0 2017.775 0.4176603
## 1 2017.919 0.2733338
# Make predictions on the test dataset
predictions <- predict(naive_bayes_model, newdata = test_data)
# Build confusion matrix
confusion_matrix_nb <- table(Actual = test_data$canceled.TRUE, Predicted = predictions)
print(confusion_matrix_nb)
## Predicted
## Actual 0 1
## 0 6543 789
## 1 1695 1845
predicted_prob <- predict(naive_bayes_model, newdata = test_data, type="raw")
predicted_class <- predict(naive_bayes_model, newdata = test_data)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc(test_data$canceled.TRUE,predicted_prob[,1])
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
##
## Call:
## roc.default(response = test_data$canceled.TRUE, predictor = predicted_prob[, 1])
##
## Data: predicted_prob[, 1] in 7332 controls (test_data$canceled.TRUE 0) > 3540 cases (test_data$canceled.TRUE 1).
## Area under the curve: 0.8022
plot.roc(test_data$canceled.TRUE,predicted_prob[,1],print.thres="best")
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
# Naive Bayes model
# Calculate accuracy
accuracy_nb <- sum(diag(confusion_matrix_nb)) / sum(confusion_matrix_nb)
# Calculate precision
precision_nb <- confusion_matrix_nb[2, 2] / sum(confusion_matrix_nb[, 2])
# Calculate recall (sensitivity)
recall_nb <- confusion_matrix_nb[2, 2] / sum(confusion_matrix_nb[2, ])
# Calculate F1 score
f1_score_nb <- 2 * (precision_nb * recall_nb) / (precision_nb + recall_nb)
# Print metrics for Naive Bayes model
cat("Naive Bayes Model:\n")
## Naive Bayes Model:
cat("Accuracy:", accuracy_nb, "\n")
## Accuracy: 0.7715232
cat("Precision:", precision_nb, "\n")
## Precision: 0.7004556
cat("Recall (Sensitivity):", recall_nb, "\n")
## Recall (Sensitivity): 0.5211864
cat("F1 Score:", f1_score_nb, "\n\n")
## F1 Score: 0.5976676
# KNN model
# Calculate accuracy
accuracy_knn <- sum(diag(confusion_matrix_KNN)) / sum(confusion_matrix_KNN)
# Calculate precision
precision_knn <- confusion_matrix_KNN[2, 2] / sum(confusion_matrix_KNN[, 2])
# Calculate recall (sensitivity)
recall_knn <- confusion_matrix_KNN[2, 2] / sum(confusion_matrix_KNN[2, ])
# Calculate F1 score
f1_score_knn <- 2 * (precision_knn * recall_knn) / (precision_knn + recall_knn)
# Print metrics for KNN model
cat("KNN Model:\n")
## KNN Model:
cat("Accuracy:", accuracy_knn, "\n")
## Accuracy: 0.8055556
cat("Precision:", precision_knn, "\n")
## Precision: 0.7634885
cat("Recall (Sensitivity):", recall_knn, "\n")
## Recall (Sensitivity): 0.5836158
cat("F1 Score:", f1_score_knn, "\n")
## F1 Score: 0.6615434
#comparing the model performance
comparison_df <- data.frame(
Classifier = c("KNN", "Naive Bayes"),
Accuracy = c(accuracy_knn, accuracy_nb),
Precision = c(precision_knn, precision_nb),
Recall = c(recall_knn, recall_nb),
F1_Score = c(f1_score_knn, f1_score_nb)
)
print(comparison_df)
## Classifier Accuracy Precision Recall F1_Score
## 1 KNN 0.8055556 0.7634885 0.5836158 0.6615434
## 2 Naive Bayes 0.7715232 0.7004556 0.5211864 0.5976676
library(ggplot2)
library(reshape2)
comparison_df_melted <- melt(comparison_df, id.vars = "Classifier")
ggplot(comparison_df_melted, aes(x = variable, y = value, fill = Classifier)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
labs(title = "Comparison of Classifiers",
x = "Metric",
y = "Value",
fill = "Classifier") +
theme_minimal()